In [1]:
#This notebook cleans data from a dataset with wine details and ratings (points),
#creating a model which can predict any wine's score using its characteristics

#The final model has an accuracy of 97.8% (mean error percentage 2.2%) and a median absolute error of 1.67

In [2]:
#IMPORT LIBRARIES

In [3]:
import os; import pandas as pd; import numpy as np

In [4]:
#LOAD DATA

In [5]:
master = pd.read_csv('wine_data_master.csv') #dataset with 8 variables, target is called 'points'

backup = master

In [6]:
#START RUNNING CODE FROM HERE

master = backup
master.shape

(280901, 10)

In [7]:
#region_2 is only present in US

master = backup.drop(columns='region_2')
master.shape

(280901, 9)

In [8]:
#remove INF and NaNs, replace with special value            #EDIT: just drop those rows since there's too many anyway

master = master.replace([np.inf, -np.inf], np.nan)
master = master.dropna()
master.shape

(147459, 9)

In [9]:
#reduce number of rows

master = master.sample(frac=0.35)
master.shape

(36865, 9)

In [10]:
#identify variables with highest nunique

master.nunique()

new_id         36865
country            7
designation    17782
points            21
price            253
province          65
region_1        1013
variety          401
winery          8029
dtype: int64

In [11]:
#IDENTIFY RELEVANT COLUMNS

data = master.columns.drop(['new_id', 'points'])
dataset = master[data]
master.shape

(36865, 9)

In [12]:
#reduce number of columns (after categorical -> indicator) by dropping variable with highest nunique

data = data.drop('designation')
# data = data.drop('winery')
dataset = master[data]
dataset.nunique()

country        7
price        253
province      65
region_1    1013
variety      401
winery      8029
dtype: int64

In [13]:
#use dummy encoder to convert categorical variables to indicators

nonNumeric = data.drop(dataset[data].select_dtypes('number').columns)

dataNon = dataset[nonNumeric]

dataDummy = pd.get_dummies(dataNon)

In [14]:
#replace categorical variables with indicator variables 

dataset = dataset.drop(columns=nonNumeric)

dataset[dataDummy.columns] = dataDummy

In [15]:
#update data then proceed to model

data = dataset.columns

In [16]:
#SPLIT INTO TRAIN AND TEST

from sklearn.model_selection import train_test_split

trainData, testData, trainTarget, testTarget = train_test_split(dataset, master['points'], 
                                                                test_size = 0.4, random_state = 42)


In [17]:
#sanity check

trainData.shape

(22119, 9516)

In [18]:
#   4-STEP MODELLING PROCESS: IMPORT WHICH MODEL, MAKE INSTANCE OF MODEL, TRAIN USING FIT, PREDICT LABELS OF TESTDATA

In [19]:
from xgboost import XGBRegressor

In [20]:
model = XGBRegressor()

In [21]:
model.fit(trainData, trainTarget)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [22]:
prediction = model.predict(testData)   #predict probabilities, for ROC and KS
prediction

array([85.92696 , 90.571655, 87.91298 , ..., 88.308784, 90.42008 ,
       88.951035], dtype=float32)

In [23]:
prediction.mean()

88.49686

In [24]:
#EVALUATE MODEL USING METRICS

In [25]:
model.score(trainData, trainTarget)

0.4243536148966621

In [38]:
#error percentage

((prediction - testTarget)/testTarget * 100).abs().mean()

2.2176406383514404

In [89]:
#side-by-side comparison

from sklearn.metrics import median_absolute_error as scr

scr(testTarget, prediction)

1.6660537719726562

In [90]:
from sklearn.metrics import mean_squared_log_error as scor
scor(testTarget, prediction)            #multioutput not necessary

0.0007635867654445912

In [91]:
from sklearn.metrics import explained_variance_score as scorev
scorev(testTarget, prediction)            #multioutput not necessary

0.4057245195381384